//Programming IPT for TOT with no normalization
//Additionally allowing for clustered standard errors

//Syntax: IPT_TOT Y D X1 X2 X3, gen(Dhat_IPT W_IPT Dhat_logit W_logit)
capture program drop IPT_TOT
program define IPT_TOT, eclass
{
  version 10
  syntax varlist(numeric min=3) [if] [in], GENerate(string) [cluster(string)]
  marksample touse
  //Bookkeeping
  gettoken Yvar varlist : varlist
  gettoken Dvar varlist : varlist
  //Yvar is a local with name of dependent variable
  //Dvar is a local with name of binary "treatment" indicator
  //varlist is a local with names of all covariates
  tempvar ok0
  gen `ok0'=`touse'*(1-`Dvar')
  tempvar ok1
  gen `ok1'=`touse'*`Dvar'
  //ok0 and ok1 help in processing, used by MATA below in IPT_MIN
  //Next, manage the generate option
  tokenize `generate'
  local wc: word count `generate'
  if (`wc'!=4) {
    di "Too many or too few variables specified" _newline ///
      "Should specify something like " _newline ///
        "gen(Dhat_IPT W_IPT Dhat_logit W_logit)"
    stop
  }
  else {
    confirm new var `1'
    confirm new var `2'
    confirm new var `3'
    confirm new var `4'
    //Initialize IPT variables
    local Dhat="`1'"
    quietly gen double `Dhat'=. //will store up estimated conditional probabilities
    label var `Dhat' "IPT: P(`Dvar'=1|`varlist')"
    local What="`2'"
    quietly gen double `What'=. //will store up weights for E[Y]
    label var `What' "IPT: weights associated with `Dhat'"
    //Next initialize logit variables
    local Dhat_logit="`3'"
    quietly gen double `Dhat_logit'=. //will store up estimated conditional probabilities
    label var `Dhat_logit' "logit: P(`Dvar'=1|`varlist')"
    local What_logit="`4'"
    quietly gen double `What_logit'=. //will store up weights from logit
    label var `What_logit' "logit: weights associated with `Dhat_logit'"
  }
  //Run logit model for benchmark and starting values
  logit `Dvar' `varlist' if `touse'==1
  tempvar tmp
  quietly predict double `tmp' if `touse'==1
  quietly replace `Dhat_logit' = `tmp' if `touse'==1
  quietly summ `Dvar' if `touse'==1
  quietly replace `What_logit'=((`Dhat_logit')/(1-(`Dhat_logit')))*(1-r(mean))/r(mean) if `touse'==1
  quietly replace `What_logit'=1 if `Dvar'==1 & `touse'==1
  mat b=e(b)
  mata: Xnames= st_matrixcolstripe("b")
  mata: names  = Xnames[.,2] \ "TOT_`Yvar'" \  "baseline_`Yvar'"
  mata: eqname = J(rows(Xnames),1,"")\""\""
  mata: names = eqname,names
  mat b=b,0,0   //add in zeros as placeholders: \mu and \gamma
  mat V=diag(b) //this is junk and just a place-holder
  mata: st_matrixcolstripe("b", names)
  mata: st_matrixrowstripe("V", names)
  mata: st_matrixcolstripe("V", names)
  //Call to MATA to do rest of work
  //Note that IPT_MIN passes back b and V
  mata: IPT_MIN("`Yvar'","`Dvar'","`varlist'","`touse'","`ok0'","`ok1'","`Dhat'","`What'","`cluster'")
  //Note: IPT_MIN returns "logit/IPT" coefficients and estimated population mean in b
  //and also redefines V to be the joint variance matrix for those quantities
  eret post b V, e(`touse') depname(`Dvar')
  eret di
}
end

mata
real matrix IPT_PHI(X, Q, theta)
{
  K=cols(theta)-1
  alpha=theta[1+K]
  beta=theta[1::K]
  mu=alpha:+X*transpose(beta)
  //Construct phi, phi1, phi2
  expmu=exp(mu)
  phi  = expmu*(1-Q)/Q
  phi1  = expmu*(1-Q)/Q
  phi2  = expmu*(1-Q)/Q
  return((phi,phi1,phi2))
}

void IPT_CRIT(todo, theta, crit, g, H)
{
  external X0, Q, Xbar1, K  //"sees" these, even though created in IPT_MIN
  //Xbar1 is the sample mean among the treated
  //X0 is the matrix of covariates among the controls
  phimat = IPT_PHI(X0, Q, theta)
  phi = phimat[.,1]
  //Now define criterion
  alpha=theta[1+K]
  beta=theta[1::K]
  crit=alpha+Xbar1*transpose(beta)-mean(phi) //note that this is the mean over D=1
  if (todo>=1) {
    phi1=phimat[.,2]
    tmp=mean(phi1)
    //tmp
    g=Xbar1:-mean(X0,phi1)*tmp //Xbar row vector, mean(X) row vector
    g=g,1-tmp //add on the derivative w.r.t. alpha
  }
  if (todo==2) {
    phi=phimat[.,1]
    H=J(1+K,1+K,.)
    tmp=mean(phi)
    H[1::K,1::K] = -quadcross(X0,phi,X0)/rows(X0)
    H[K+1,1::K] = -mean(X0,phi)*tmp
    H[1::K,K+1] = -transpose(mean(X0,phi))*tmp
    H[1+K,1+K] = -tmp
  }
//theta
//g
//H
}

void IPT_MIN(string scalar Yvar, string scalar Dvar, string scalar varlist, ///
             string scalar ok, string scalar ok0, string scalar ok1, ///
             string scalar Dhat, string scalar What, string scalar groupvar)
{
  external X0, Q, Xbar1, K //allows IPT_CRIT to "see" these
  st_view(D=., ., tokens(Dvar), ok) //saves memory
  st_view(X=., ., tokens(varlist), ok)
  st_view(X1=., ., tokens(varlist), ok1)
  st_view(X0=., ., tokens(varlist), ok0)
  st_view(Y=., ., tokens(Yvar), ok)
  st_view(Y1=., ., tokens(Yvar), ok1)
  st_view(Y0=., ., tokens(Yvar), ok0)
  N = rows(X)
  N1 = rows(X1)
  Q=N1/N
  K = cols(X)
  Xbar1 = mean(X1)
  //STARTING VALUES
  init1 = st_matrix("b") //logit coefficients: row vector
  init1 = init1[1::1+K] //"b" has zeros tacked on
  //SETUP OPTIMIZATION PROBLEM
  S = optimize_init() //name the programming problem S
  optimize_init_technique(S,"nr") //changes optimization technique
  optimize_init_evaluator(S, &IPT_CRIT()) //associate the objective function w/ S
  optimize_init_which(S,"max") //maximize since problem is TOT
  //optimize_init_evaluatortype(S,"d0") //only use objective function
  //optimize_init_evaluatortype(S,"d1") //use first derivatives
  optimize_init_evaluatortype(S,"d2") //use first and second derivatives
  //CONVERGENCE CRITERIA 
  //Stata uses three types of convergence checks, stopping when all 3 are met
  tol=1e-15
  optimize_init_conv_ptol(S,tol) //1e-15 is pretty seriously punitive, but
  optimize_init_conv_vtol(S,tol) //when the computational problem can be solved,
  optimize_init_conv_nrtol(S,tol) //it can be solved to great precision easily
  //FIX MAX NUMBER OF ITERATIONS
  optimize_init_conv_maxiter(S,30)
  //PASS STARTING VALUES
  //init1=J(1,1+K,0)
  optimize_init_params(S,init1) //these are the logit coefficients
  //OPTIMIZE PLEASE
  beta_alpha = optimize(S) //in keeping w/STATA convention, constant is LAST not first
  //H=optimize_result_Hessian(S)
  //eigenvalues(H) //check if H is positive definite
  //GIVE STATA BACK WEIGHTS, PROPENSITY SCORE
  phimat=IPT_PHI(X, Q, beta_alpha) //use IPT_PHI to get weight for full sample
  W=phimat[.,2] //"DFL" weight
  G=W:/(W:+((1-Q)/Q)) //implied propensity score--W is proportional to (1-Q)/Q, so cancels
  st_store(., Dhat, ok, G) //Spit back to Stata the propensity score
  W=D:+(1:-D):*W //reset to 1 for D=1 subsample
  st_store(., What, ok, W) //Spit back to Stata the weights 
  //beta=beta_alpha[1::K]
  //alpha=beta_alpha[K+1]
  //v=alpha:+X*transpose(beta)
  //G=1:/(1:+exp(-v))

  //ESTIMATE \gamma and \mu
  phimat=IPT_PHI(X0, Q, beta_alpha) //use IPT_PHI to get weight for D=0 group
  W0=phimat[.,2] //"DFL" weight
  gamma=mean(Y1)-mean(Y0,W0)
  mu=mean(Y,W)-gamma*mean(D,W)
  theta=beta_alpha,gamma,mu

  //CONSTRUCT STANDARD ERRORS FOR THETA
  Gover1mG=G:/(1:-G)
  V=G:/(1:-G) //for logit, V is the same as Gover1mG
  Wt=D:+(1:-D):*Gover1mG
  e=(Y:-mu:-gamma:*D)
  uno=J(N,1,1)
  Z1=X,uno
  Z2=D,uno
  m=(D:-(1:-D):*V):*Z1 , e:*Wt:*Z2
  //To compute the variance of m, we need two solutions, one for
  //whether we are clustering and one for when we are not
  if (groupvar=="") {
    B=variance(m) //"meat" of sandwich variance estimator
  }
  else {
    group=st_data(., tokens(groupvar), ok)
    tmp=group,m
    _sort(tmp,1) //sort bigm in place by first column
    V1=tmp[.,1]
    V2=tmp[.,2::cols(tmp)]
    info=panelsetup(V1,1)
    J=rows(info)
    colsm=cols(m)
    term=J(J,colsm*(1+colsm)/2,.)
    for (j=1; j<=J; j++) {
      mvec=panelsubmatrix(V2, j, info)
      sum_mvec=mean(mvec)*rows(mvec)
      term[j,.]=transpose(vech(transpose(sum_mvec)*sum_mvec))
    }
    avg=mean(term)
    B=invvech(transpose(avg))*J^2/((J-1)*N)
  }
  wgt=-(1:-D):*V
  zeros=J(N,1,0)
  Z3=X,uno,zeros,-e
  A1=quadcross(Z3,wgt,Z1)/N
  wgt=-Wt
  A2b=quadcross(Z2,wgt,Z2)/N
  A2=J(1+K,2,0)\A2b
  A=A1,A2
  Ainv=luinv(A) //"bread" of sandwich variance estimator
  V=Ainv*B*transpose(Ainv)/N
  //GIVE STATA BACK POINT ESTIMATES AND VARIANCE MATRIX
  st_replacematrix("b",theta)
  st_replacematrix("V",V)
}

/*
real matrix transpose(real A)
{
  return(A')
}
*/
end

